import io
import urllib
import urllib.request

import pandas as pd
# Build the URL for the ENA advanced search (result-count query).
# Count query: influenza A (taxon 11320) samples in the ENA warehouse.
_count_parts = [
    'http://www.ebi.ac.uk/ena/data/warehouse/search?',  # advanced-search endpoint
    '"tax_tree(11320)"',  # influenza A taxon and all subordinates (tree)
    '&result=sample',     # sample records carry the geolocation field
    '&resultcount',       # ask only for the number of hits
]
url = ''.join(_count_parts)
# Query the URL and print the result count.
# Execute the count query and show the raw response (the total number of
# matching samples).  Python 3: urllib.urlopen moved to urllib.request.urlopen,
# and print is a function.
print(urllib.request.urlopen(url).read())
# Build the URL for the full tab-separated report query.
# Report query: accession + location for every influenza A sample.
url = (
    'http://www.ebi.ac.uk/ena/data/warehouse/search?'  # advanced-search endpoint
    '"tax_tree(11320)"'            # influenza A taxon and all subordinates
    '&result=sample'               # sample records
    '&display=report'              # report is the tab-separated output
    '&fields=accession,location'   # the two columns we need
    '&offset=1&length=1095067'     # fetch every result row
)
# Download the table into a string.
# Download the whole report into memory.  Python 3: urlopen lives in
# urllib.request and returns bytes, so decode before handing it to StringIO.
ena_flu_loco_page = urllib.request.urlopen(url).read().decode('utf-8')
# Load the table into a pandas DataFrame.
# Python 3: the StringIO module was merged into io.
from io import StringIO

# Parse the tab-separated report into a DataFrame.
ena_flu_loco_table = pd.read_csv(StringIO(ena_flu_loco_page), sep='\t')
# Peek into the table.
# Peek at the parsed table, then keep only rows that actually carry a location.
ena_flu_loco_table.head()
# Compute the filter once (the original evaluated the same boolean mask twice);
# .notnull() is the idiomatic spelling of `pd.isnull(...) == False`.
ena_loco = ena_flu_loco_table[ena_flu_loco_table['location'].notnull()]
print("The number of sample with geolocations is: ", len(ena_loco))
ena_loco.head()
# Inspect rows whose location string is not the expected four tokens
# ('<lat> <N|S> <lon> <E|W>').
bad_format = [len(loc.split(' ')) != 4 for loc in ena_loco['location']]
err = ena_loco[bad_format]
err.head()
# Delete these malformed rows.
# Keep only the well-formed four-token '<lat> <N|S> <lon> <E|W>' locations.
good_format = [len(loc.split(' ')) == 4 for loc in ena_loco['location']]
ena_loco = ena_loco[good_format]
def parse_lat(string_loc):
    """Parse the latitude out of an ENA '<lat> <N|S> <lon> <E|W>' string.

    Parameters
    ----------
    string_loc : str
        Location formatted as '<lat> <N|S> <lon> <E|W>'.

    Returns
    -------
    float or None
        Signed decimal latitude (north positive, south negative); None when
        the hemisphere token is not N/S.
    """
    tokens = string_loc.strip().split(' ')
    hemisphere = tokens[1].upper()  # tolerate lowercase hemisphere letters
    if hemisphere == 'N':
        return float(tokens[0])
    if hemisphere == 'S':
        return -float(tokens[0])
    # Explicit None instead of the original silent fall-through.
    return None
def parse_lon(string_loc):
    """Parse the longitude out of an ENA '<lat> <N|S> <lon> <E|W>' string.

    Parameters
    ----------
    string_loc : str
        Location formatted as '<lat> <N|S> <lon> <E|W>'.

    Returns
    -------
    float or None
        Signed decimal longitude (east positive, west negative); None when
        the hemisphere token is not E/W.
    """
    tokens = string_loc.strip().split(' ')
    hemisphere = tokens[3].upper()  # tolerate lowercase hemisphere letters
    if hemisphere == 'E':
        return float(tokens[2])
    if hemisphere == 'W':
        return -float(tokens[2])
    # Explicit None instead of the original silent fall-through.
    return None
# Series.apply evaluates eagerly; the original map() call only produced a
# list under Python 2 -- in Python 3 map() is a lazy iterator, which pandas
# does not accept as column values.
# NOTE(review): ena_loco is a filtered slice, so pandas may emit a
# SettingWithCopyWarning here -- confirm whether a .copy() upstream is wanted.
ena_loco['lat'] = ena_loco['location'].apply(parse_lat)
ena_loco['lon'] = ena_loco['location'].apply(parse_lon)
ena_loco.head()
# Collapse identical coordinates and count how many samples share each one.
uniq_locs = ena_loco.groupby(['lat', 'lon']).size().reset_index()
uniq_locs.columns = ['lat', 'lon', 'count']
# DataFrame.sort was removed in pandas 0.20; sort_values is the modern call.
# (The original sorted only inside len(), which had no effect -- sort and
# assign so the most-sampled locations come first.)
uniq_locs = uniq_locs.sort_values('count', ascending=False)
print('Number of unique locations:', len(uniq_locs))
def form_acc(x):
    """Summarize one (lat, lon) group of samples.

    Returns a Series with the group size ('count') and a space-joined
    accession list ('acc_list'); groups of five or more are abbreviated to
    the first two and last two accessions separated by ' ... '.
    """
    accessions = list(x['accession'])
    total = x['accession'].size
    if total < 5:
        shown = ' '.join(accessions)
    else:
        shown = ' '.join(accessions[:2]) + ' ... ' + ' '.join(accessions[-2:])
    return pd.Series(dict(count=total, acc_list=shown))
# Attach the abbreviated accession list to every unique coordinate pair.
by_coord = ena_loco.groupby(['lat', 'lon'])
uniq_locs_w_acc = by_coord.apply(form_acc).reset_index()
from IPython.core.display import HTML
import folium
def inline_map(m, width=650, height=500):
    """Takes a folium map instance and embeds its rendered HTML in an iframe.

    NOTE(review): this uses the pre-0.2 folium API (_build_map / .HTML) --
    confirm the installed folium version still provides it.
    """
    m._build_map()
    # Escape double quotes so the whole document can live inside the
    # srcdoc="..." attribute.  The original replace('"', '"') was a no-op
    # (HTML-entity mangling of '&quot;'), which terminated the attribute at
    # the first quote of the embedded HTML.
    srcdoc = m.HTML.replace('"', '&quot;')
    embed = HTML('<iframe srcdoc="{}" '
                 'style="width: {}px; height: {}px; '
                 'border: none"></iframe>'.format(srcdoc, width, height))
    return embed
# Base map centred on the North Atlantic, wide enough to show both hemispheres.
map_width, map_height = 650, 500
flu_map = folium.Map(location=[47, -17], zoom_start=3,
                     tiles='OpenStreetMap',
                     width=map_width, height=map_height)
# Draw one circle per unique location, scaled by how many samples share it.
# Python 3: xrange no longer exists; iterrows also avoids the repeated
# index-then-iloc lookups of the original loop.
# NOTE(review): circle_marker is the pre-0.2 folium API -- confirm the
# installed folium version still provides it.
for _, row in uniq_locs_w_acc.iterrows():
    flu_map.circle_marker(location=(row['lat'], row['lon']),
                          radius=100 * row['count'],
                          line_color='none', fill_color='#3186cc',
                          fill_opacity=0.7, popup=row['acc_list'])
inline_map(flu_map)